Script to reproduce years based on a model trained with random points¶

Importing¶

In [ ]:
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn import preprocessing

from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import BaggingRegressor

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import root_mean_squared_error as rmse

from tqdm import tqdm

import dill
import random

import salishsea_tools.viz_tools as sa_vi

Datasets Preparation¶

In [ ]:
def datasets_preparation(dataset, dataset2):
    
    drivers = np.stack([np.ravel(dataset['Temperature_(0m-15m)']),
        np.ravel(dataset['Temperature_(15m-100m)']), 
        np.ravel(dataset['Salinity_(0m-15m)']),
        np.ravel(dataset['Salinity_(15m-100m)']),
        np.ravel(dataset2['Summation_of_solar_radiation']),
        np.ravel(dataset2['Mean_wind_speed']),
        np.ravel(dataset2['Mean_air_temperature'])
        ])
    indx = np.where(~np.isnan(drivers).any(axis=0))
    drivers = drivers[:,indx[0]]

    diat = np.ravel(dataset['Diatom'])
    diat = diat[indx[0]]

    return(drivers, diat, indx)

Regressor¶

In [ ]:
def regressor (inputs, targets):
    
    inputs = inputs.transpose()
    
    # Regressor
    X_train, _, y_train, _ = train_test_split(inputs, targets, train_size=0.35)

    model = MLPRegressor(hidden_layer_sizes=200)
    model = make_pipeline(StandardScaler(), model)
    regr = BaggingRegressor(model, n_estimators=12, n_jobs=4).fit(X_train, y_train)

    return (regr)

Regressor 2¶

In [ ]:
def regressor2 (inputs, targets, variable_name):
    
    inputs = inputs.transpose()
    
    # Regressor
    # scale = preprocessing.StandardScaler()
    # inputs2 = scale.fit_transform(inputs)

    outputs_test = regr.predict(inputs)
   
    m = scatter_plot(targets, outputs_test, variable_name) 
    r = np.round(np.corrcoef(targets, outputs_test)[0][1],3)
    rms = rmse(targets, outputs_test)

    return (r, rms, m)

Regressor 3¶

In [ ]:
def regressor3 (inputs, targets):
    
    inputs = inputs.transpose()
    
    # Regressor
    # scale = preprocessing.StandardScaler()
    # inputs2 = scale.fit_transform(inputs)

    outputs_test = regr.predict(inputs)
   
    # compute slope m and intercept b
    m, b = np.polyfit(targets, outputs_test, deg=1)
    
    r = np.round(np.corrcoef(targets, outputs_test)[0][1],3)
    rms = rmse(targets, outputs_test)

    return (r, rms, m)

Regressor 4¶

In [ ]:
def regressor4 (inputs, targets, variable_name):
    
    inputs = inputs.transpose()
    
    # Regressor
    # scale = preprocessing.StandardScaler()
    # inputs2 = scale.fit_transform(inputs)

    outputs = regr.predict(inputs)

    # Post processing
    indx2 = np.full((len(diat_i.y)*len(diat_i.x)),np.nan)
    indx2[indx[0]] = outputs
    model = np.reshape(indx2,(len(diat_i.y),len(diat_i.x)))

    m = scatter_plot(targets, outputs, variable_name + str(dates[i].date())) 

    # Preparation of the dataarray 
    model = xr.DataArray(model,
        coords = {'y': diat_i.y, 'x': diat_i.x},
        dims = ['y','x'],
        attrs=dict( long_name = variable_name + "Concentration",
        units="mmol m-2"),)
                        
    plotting3(targets, model, diat_i, variable_name)

Printing¶

In [ ]:
def printing (targets, outputs, m):

    print ('The amount of data points is', outputs.size)
    print ('The slope of the best fitting line is ', np.round(m,3))
    print ('The correlation coefficient is:', np.round(np.corrcoef(targets, outputs)[0][1],3))
    print (' The mean square error is:', rmse(targets,outputs))

Scatter Plot¶

In [ ]:
def scatter_plot(targets, outputs, variable_name):

    # compute slope m and intercept b
    m, b = np.polyfit(targets, outputs, deg=1)

    printing(targets, outputs, m)

    fig, ax = plt.subplots(2, figsize=(5,10), layout='constrained')

    ax[0].scatter(targets,outputs, alpha = 0.2, s = 10)

    lims = [np.min([ax[0].get_xlim(), ax[0].get_ylim()]),
        np.max([ax[0].get_xlim(), ax[0].get_ylim()])]

    # plot fitted y = m*x + b
    ax[0].axline(xy1=(0, b), slope=m, color='r')

    ax[0].set_xlabel('targets')
    ax[0].set_ylabel('outputs')
    ax[0].set_xlim(lims)
    ax[0].set_ylim(lims)
    ax[0].set_aspect('equal')

    ax[0].plot(lims, lims,linestyle = '--',color = 'k')

    h = ax[1].hist2d(targets,outputs, bins=100, cmap='jet', 
        range=[lims,lims], cmin=0.1, norm='log')
    
    ax[1].plot(lims, lims,linestyle = '--',color = 'k')

    # plot fitted y = m*x + b
    ax[1].axline(xy1=(0, b), slope=m, color='r')

    ax[1].set_xlabel('targets')
    ax[1].set_ylabel('outputs')
    ax[1].set_aspect('equal')

    fig.colorbar(h[3],ax=ax[1], location='bottom')

    fig.suptitle(variable_name)

    plt.show()

    return (m)

Plotting¶

In [ ]:
def plotting(variable, name):

    plt.plot(years,variable, marker = '.', linestyle = '')
    plt.xlabel('Years')
    plt.ylabel(name)
    plt.show()

Plotting 2¶

In [ ]:
def plotting2(variable,title):
    
    fig, ax = plt.subplots()

    scatter= ax.scatter(dates,variable, marker='.', c=pd.DatetimeIndex(dates).month)

    ax.legend(handles=scatter.legend_elements()[0], labels=['February','March','April'])
    fig.suptitle('Daily ' + title + ' (15 Feb - 30 Apr)')
    
    fig.show()

Plotting 3¶

In [ ]:
def plotting3(targets, model, variable, variable_name):

    fig, ax = plt.subplots(2,2, figsize = (10,15))

    cmap = plt.get_cmap('cubehelix')
    cmap.set_bad('gray')

    variable.plot(ax=ax[0,0], cmap=cmap, vmin = targets.min(), vmax =targets.max(), cbar_kwargs={'label': variable_name + ' Concentration  [mmol m-2]'})
    model.plot(ax=ax[0,1], cmap=cmap, vmin = targets.min(), vmax = targets.max(), cbar_kwargs={'label': variable_name + ' Concentration  [mmol m-2]'})
    ((variable-model) / variable * 100).plot(ax=ax[1,0], cmap=cmap, cbar_kwargs={'label': variable_name + ' Concentration  [percentage]'})

    plt.subplots_adjust(left=0.1,
        bottom=0.1, 
        right=0.95, 
        top=0.95, 
        wspace=0.35, 
        hspace=0.35)

    sa_vi.set_aspect(ax[0,0])
    sa_vi.set_aspect(ax[0,1])
    sa_vi.set_aspect(ax[1,0])


    ax[0,0].title.set_text(variable_name + ' (targets)')
    ax[0,1].title.set_text(variable_name + ' (outputs)')
    ax[1,0].title.set_text('targets - outputs')
    ax[1,1].axis('off')

    fig.suptitle(str(dates[i].date()))

    plt.show()
    

Training (Random Points)¶

In [ ]:
ds = xr.open_dataset('/data/ibougoudis/MOAD/files/integrated_model_var_old.nc')
ds2 = xr.open_dataset('/data/ibougoudis/MOAD/analysis-ilias/notebooks/external_inputs.nc')

ds = ds.isel(time_counter = (np.arange(0, len(ds.time_counter),2)), 
    y=(np.arange(ds.y[0], ds.y[-1], 5)), 
    x=(np.arange(ds.x[0], ds.x[-1], 5)))

ds2 = ds2.isel(time_counter = (np.arange(0, len(ds2.time_counter),2)), 
    y=(np.arange(ds2.y[0], ds2.y[-1], 5)), 
    x=(np.arange(ds2.x[0], ds2.x[-1], 5)))

dates = pd.DatetimeIndex(ds['time_counter'].values)

drivers, diat, _ = datasets_preparation(ds, ds2)

regr = regressor(drivers, diat)

Other Years (Anually)¶

In [ ]:
years = range (2007,2024)

r_all = []
rms_all = []
slope_all = []

for year in range (2007,2024):
    
    dataset = ds.sel(time_counter=str(year))
    dataset2 = ds2.sel(time_counter=str(year))
    
    drivers, diat, _ = datasets_preparation(dataset, dataset2)

    r, rms, m = regressor2(drivers, diat, 'Diatom ' + str(year))
    
    r_all.append(r)
    rms_all.append(rms)
    slope_all.append(m)
    
plotting(np.transpose(r_all), 'Correlation Coefficient')
plotting(np.transpose(rms_all), 'Root Mean Square Error')
plotting (np.transpose(slope_all), 'Slope of the best fitting line')
The amount of data points is 70794
The slope of the best fitting line is  0.474
The correlation coefficient is: 0.735
 The mean square error is: 0.10892511046778412
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.517
The correlation coefficient is: 0.717
 The mean square error is: 0.10201272586790976
No description has been provided for this image
The amount of data points is 68931
The slope of the best fitting line is  0.511
The correlation coefficient is: 0.78
 The mean square error is: 0.12915558425003135
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.422
The correlation coefficient is: 0.68
 The mean square error is: 0.1082668110413645
No description has been provided for this image
The amount of data points is 68931
The slope of the best fitting line is  0.604
The correlation coefficient is: 0.811
 The mean square error is: 0.09297382289081609
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.61
The correlation coefficient is: 0.813
 The mean square error is: 0.0942724869175936
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.438
The correlation coefficient is: 0.728
 The mean square error is: 0.12753845839314087
No description has been provided for this image
The amount of data points is 68931
The slope of the best fitting line is  0.461
The correlation coefficient is: 0.657
 The mean square error is: 0.11012888183953273
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.311
The correlation coefficient is: 0.574
 The mean square error is: 0.12488661757637735
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.524
The correlation coefficient is: 0.772
 The mean square error is: 0.1082596163796345
No description has been provided for this image
The amount of data points is 68931
The slope of the best fitting line is  0.544
The correlation coefficient is: 0.72
 The mean square error is: 0.09355425751063623
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.384
The correlation coefficient is: 0.611
 The mean square error is: 0.1260795821851919
No description has been provided for this image
The amount of data points is 68931
The slope of the best fitting line is  0.465
The correlation coefficient is: 0.672
 The mean square error is: 0.12852811618018917
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.386
The correlation coefficient is: 0.705
 The mean square error is: 0.15057420627549628
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.571
The correlation coefficient is: 0.801
 The mean square error is: 0.10507208144107158
No description has been provided for this image
The amount of data points is 68931
The slope of the best fitting line is  0.515
The correlation coefficient is: 0.706
 The mean square error is: 0.10425018273783825
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.375
The correlation coefficient is: 0.606
 The mean square error is: 0.13400448449315852
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Other Years (Daily)¶

In [ ]:
r_all2 = np.array([])
rms_all2 = np.array([])
slope_all2 = np.array([])

for i in tqdm(range (0, len(ds.time_counter))):
    
    dataset = ds.isel(time_counter=i)
    dataset2 = ds2.isel(time_counter=i)

    drivers, diat, _ = datasets_preparation(dataset, dataset2)

    r, rms, m = regressor3(drivers, diat)

    r_all2 = np.append(r_all2,r)
    rms_all2 = np.append(rms_all2,rms)
    slope_all2 = np.append(slope_all2,m)

plotting2(r_all2, 'Correlation Coefficients')
plotting2(rms_all2, 'Root Mean Square Errors')
plotting2(slope_all2, 'Slope of the best fitting line')
100%|██████████| 640/640 [00:27<00:00, 23.21it/s]
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Daily Maps¶

In [ ]:
maps = random.sample(range(0,len(ds.time_counter)),10)

for i in maps:

    dataset = ds.isel(time_counter=i)
    dataset2 = ds2.isel(time_counter=i)
    drivers, diat, indx = datasets_preparation(dataset, dataset2)

    diat_i = dataset['Diatom']

    regressor4(drivers, diat, 'Diatom ')
The amount of data points is 1863
The slope of the best fitting line is  0.502
The correlation coefficient is: 0.535
 The mean square error is: 0.0722795124012919
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  0.196
The correlation coefficient is: 0.26
 The mean square error is: 0.1025743962578936
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  0.57
The correlation coefficient is: 0.71
 The mean square error is: 0.06427547912385609
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  0.673
The correlation coefficient is: 0.668
 The mean square error is: 0.041535064743133746
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  0.314
The correlation coefficient is: 0.417
 The mean square error is: 0.05614134053079165
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  0.301
The correlation coefficient is: 0.72
 The mean square error is: 0.16718988959731293
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  0.826
The correlation coefficient is: 0.737
 The mean square error is: 0.04817263224283529
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  0.19
The correlation coefficient is: 0.376
 The mean square error is: 0.11338250876852003
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  0.296
The correlation coefficient is: 0.442
 The mean square error is: 0.049893842390791227
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  0.163
The correlation coefficient is: 0.371
 The mean square error is: 0.16255692763351973
No description has been provided for this image
No description has been provided for this image
In [ ]: